In [1]:
import os
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from wordcloud import WordCloud
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import log_loss
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import joblib
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

nltk.download('stopwords')
nltk.download('punkt')
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Out[1]:
True
In [2]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

label_encoder = LabelEncoder()

porter=PorterStemmer()

scaler = MaxAbsScaler()

Data Preprocessing¶

In [3]:
positive_folder = 'M:/s8/NLP/project/Dataset/review_polarity/txt_sentoken/pos'
negative_folder = 'M:/s8/NLP/project/Dataset/review_polarity/txt_sentoken/neg'


texts = []
labels = []

stop_words = set(stopwords.words('english'))
ps = PorterStemmer()



def preprocess_text(text):

    # Tokenization
    tokens = word_tokenize(text)

    # Lowercasing, removing punctuation, and removing stopwords
    stop_words = set(stopwords.words('english'))
    processed_tokens = [token.lower() for token in tokens if token.lower() not in stop_words and token.lower() not in string.punctuation]

    # Stemming
    ps = PorterStemmer()
    processed_tokens = [ps.stem(token) for token in processed_tokens]

    # Removing special characters, emojis, and handling contractions
    processed_tokens = [re.sub(r'[^a-zA-Z0-9\s]', '', token) for token in processed_tokens]

    # Handling numerical data
    processed_tokens = ['NUM' if token.isdigit() else token for token in processed_tokens]

    # Joining Tokens
    processed_text = ' '.join(processed_tokens)

    return processed_text
In [4]:
for filename in os.listdir(positive_folder):
    with open(os.path.join(positive_folder, filename), 'r') as file:
        text = file.read()
        processed_text = preprocess_text(text)
        texts.append(processed_text)
        labels.append('pos')


for filename in os.listdir(negative_folder):
    with open(os.path.join(negative_folder, filename), 'r') as file:
        text = file.read()
        processed_text = preprocess_text(text)
        texts.append(processed_text)
        labels.append('neg')

# Create DataFrame
df = pd.DataFrame({'text': texts, 'label': labels})

df.to_csv('preprocessed_data.csv', index=False)
In [5]:
df = pd.read_csv('preprocessed_data.csv')
df
Out[5]:
text label
0 film adapt comic book plenti success whether r... pos
1 everi movi come along suspect studio everi ind... pos
2 ve got mail work alot better deserv order make... pos
3 jaw rare film grab attent show singl imag sc... pos
4 moviemak lot like gener manag nfl team postsal... pos
... ... ...
1995 anyth stigmata taken warn releas similarlyth... neg
1996 john boorman s zardoz goofi cinemat debacl f... neg
1997 kid hall acquir tast took least season watch s... neg
1998 time john carpent great horror director cours ... neg
1999 two parti guy bob head haddaway s danc hit lo... neg

2000 rows × 2 columns

In [6]:
neg_df = df[df['label'] == 'neg']

neg_df
Out[6]:
text label
1000 plot two teen coupl go church parti drink driv... neg
1001 happi bastard s quick movi review damn y2k bug... neg
1002 movi like make jade movi viewer thank invent t... neg
1003 quest camelot warner bro first featurelength... neg
1004 synopsi mental unstabl man undergo psychothera... neg
... ... ...
1995 anyth stigmata taken warn releas similarlyth... neg
1996 john boorman s zardoz goofi cinemat debacl f... neg
1997 kid hall acquir tast took least season watch s... neg
1998 time john carpent great horror director cours ... neg
1999 two parti guy bob head haddaway s danc hit lo... neg

1000 rows × 2 columns

In [7]:
pos_df = df[df['label'] == 'pos']

pos_df
Out[7]:
text label
0 film adapt comic book plenti success whether r... pos
1 everi movi come along suspect studio everi ind... pos
2 ve got mail work alot better deserv order make... pos
3 jaw rare film grab attent show singl imag sc... pos
4 moviemak lot like gener manag nfl team postsal... pos
... ... ...
995 wow movi s everyth movi funni dramat interest ... pos
996 richard gere command actor s alway great film ... pos
997 glori star matthew broderick denzel washingto... pos
998 steven spielberg s second epic film world war ... pos
999 truman trueman burbank perfect name jim carr... pos

1000 rows × 2 columns

In [8]:
df2 = df.copy()


df2['text_word_count']=df2['text'].apply(lambda x:len(x.split()))

numerical_feature_cols=['text_word_count']
plt.figure(figsize=(24,6))
fig = px.histogram(df2, x='text_word_count', nbins=50, color_discrete_sequence=['#6495ED'])
fig.update_layout(title_text="Distribution of Text Word Count", title_x=0.5)
fig.show()
<Figure size 2400x600 with 0 Axes>
In [9]:
plt.figure(figsize=(24,6))
for i,col in enumerate(numerical_feature_cols):
    plt.subplot(1,3,i+1)
    sns.histplot(data=df2,x=col,hue='label',bins=50)
    plt.title(f"Distribution of {col}")
plt.tight_layout()
plt.show()
In [10]:
#Let us first analyze the distribution of the target variable
from IPython.display import display

import matplotlib.pyplot as plt


print('\033[1mTarget Variable Distribution'.center(55))
plt.pie(df2['label'].value_counts(), labels=['Positive','Negative'], counterclock=False, shadow=True,
        explode=[0,0.1], autopct='%1.1f%%', radius=1, startangle=215)
plt.show()
            Target Variable Distribution           
In [11]:
# Calculate average text sequence length
average_length = df2['text'].apply(lambda x: len(x.split())).mean()

# Plot
plt.figure(figsize=(8, 6))
plt.hist(df2['text'].apply(lambda x: len(x.split())), bins=50, color='#6495ED')
plt.axvline(x=average_length, color='red', linestyle='--', label=f'Average Length: {average_length:.2f}')
plt.xlabel('Text Sequence Length')
plt.ylabel('Frequency')
plt.title('Distribution of Text Sequence Length')
plt.legend()
plt.show()
In [12]:
positivedata = df2[df2['label']== 'pos']
positivedata =positivedata['text']
negdata = df2[df2['label']== 'neg']
negdata= negdata['text']

def wordcloud_draw(data, color, s):
    words = ' '.join(data)
    cleaned_word = " ".join([word for word in words.split() if(word!='movie' and word!='film')])
    wordcloud = WordCloud(stopwords=stopwords.words('english'),background_color=color,width=2500,height=2000).generate(cleaned_word)
    plt.imshow(wordcloud)
    plt.title(s)
    plt.axis('off')

plt.figure(figsize=[20,10])
plt.subplot(1,2,1)
wordcloud_draw(positivedata,'white','Most-common Positive words')

plt.subplot(1,2,2)
wordcloud_draw(negdata, 'white','Most-common Negative words')
plt.show()
In [13]:
df['label'] = label_encoder.fit_transform(df['label'])

joblib.dump(label_encoder, f'label_encoder.pkl')

# Check the mapping of classes
print("Class mapping:", label_encoder.classes_)
Class mapping: ['neg' 'pos']
In [14]:
df = shuffle(df, random_state=42)
df.head()
Out[14]:
text label
1860 guess wild bachelor parti gone realli bad woul... 0
353 abund trite recycl movi late NUM tremend deman... 1
1333 hotshot defens attorney kevin lomax keanu reev... 0
905 hedwig john cameron mitchel born boy name hans... 1
1289 ve heard call jaw claw s fair summat plot th... 0
In [15]:
def tokenizer(text):
        return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

tfidf=TfidfVectorizer(strip_accents=None,lowercase=True,preprocessor=None,tokenizer=tokenizer_porter,use_idf=True,norm='l2',smooth_idf=True)
y=df.label.values
x=tfidf.fit_transform(df.text)

joblib.dump(tfidf, f'tfidf.pkl')

print(x)
print(y)
M:\Anaconda\Lib\site-packages\sklearn\feature_extraction\text.py:525: UserWarning:

The parameter 'token_pattern' will not be used since 'tokenizer' is not None'

  (0, 26663)	0.07467370329154899
  (0, 27387)	0.0473115491728708
  (0, 4305)	0.045066096646297155
  (0, 25880)	0.048692631577492505
  (0, 15953)	0.058642657480751655
  (0, 18638)	0.03380795767847503
  (0, 29453)	0.04076222683905151
  (0, 8489)	0.0544389676324058
  (0, 26559)	0.035660719138076766
  (0, 29555)	0.10439146620243106
  (0, 9338)	0.04381512611526275
  (0, 3382)	0.05414231883612125
  (0, 481)	0.04065619963533382
  (0, 20314)	0.01992799178965676
  (0, 24749)	0.04793812245881269
  (0, 22410)	0.028924894569869578
  (0, 4620)	0.11010609306834823
  (0, 26370)	0.07870483753381535
  (0, 14951)	0.05149340226537849
  (0, 5512)	0.040815561521280574
  (0, 9380)	0.025302241513689382
  (0, 14516)	0.02365975817322956
  (0, 27020)	0.026367083316675782
  (0, 12551)	0.03448504536762791
  (0, 28864)	0.05045528814188444
  :	:
  (1999, 22888)	0.15164922749549917
  (1999, 29552)	0.10222936860227075
  (1999, 29602)	0.08330362906635722
  (1999, 463)	0.02592208767800416
  (1999, 1174)	0.02370275174195459
  (1999, 29491)	0.05995116004987929
  (1999, 23150)	0.04005238040657449
  (1999, 28781)	0.024636463286650003
  (1999, 17998)	0.022863752201392265
  (1999, 2533)	0.023024110368163108
  (1999, 6461)	0.033230149361004005
  (1999, 15257)	0.07780583353766761
  (1999, 22517)	0.11729191601866788
  (1999, 27238)	0.03354401664768899
  (1999, 8309)	0.051770036725923965
  (1999, 16704)	0.05596976652760347
  (1999, 10501)	0.05047540717468999
  (1999, 5667)	0.042323818841856826
  (1999, 14838)	0.02676715948750948
  (1999, 18867)	0.04157516839880564
  (1999, 10009)	0.028515671615983614
  (1999, 15527)	0.04138320183562037
  (1999, 16823)	0.05908479751779132
  (1999, 902)	0.020029668021230247
  (1999, 17447)	0.10323381657754073
[0 1 0 ... 1 0 0]
In [16]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=80)

print("Shape of x_train:", x_train.shape)
print("Shape of x_test:", x_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)
Shape of x_train: (1600, 30612)
Shape of x_test: (400, 30612)
Shape of y_train: (1600,)
Shape of y_test: (400,)
In [17]:
x_train = scaler.fit_transform(x_train)

# Transform the test data
x_test = scaler.transform(x_test)

joblib.dump(scaler, f'scaler.pkl')

print("Shape of x_train_scaled:", x_train.shape)
print("Shape of x_test_scaled:", x_test.shape)
Shape of x_train_scaled: (1600, 30612)
Shape of x_test_scaled: (400, 30612)

ML Models¶

In [25]:
accuracy = []

Logistic regression model¶

In [27]:
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import joblib
from sklearn.metrics import classification_report

# Define the logistic regression model

train_accuracy = []
test_accuracy = []


# Define the logistic regression model
lr_model = LogisticRegression(solver='liblinear')

# Define the parameter grid for grid search
param_grid = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}


# Perform grid search
grid_search = GridSearchCV(lr_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

# Get the best model from grid search
best_lr_model = grid_search.best_estimator_

# Predictions on training data
y_train_pred_lr = best_lr_model.predict(x_train)
train_accuracy_lr = accuracy_score(y_train, y_train_pred_lr)
print("Training Accuracy of Logistic Regression Model:", train_accuracy_lr)
train_accuracy.append(train_accuracy_lr)

# Predictions on testing data
y_test_pred_lr = best_lr_model.predict(x_test)
accuracy_lr = accuracy_score(y_test, y_test_pred_lr)
print("Test Accuracy of Logistic Regression Model:", accuracy_lr)
test_accuracy.append(accuracy_lr)



accuracy.append(accuracy_lr)
print("Classification Report:")
print(classification_report(y_test, y_test_pred_lr))


# Save the trained model
joblib.dump(best_lr_model, 'best_lr_model.pkl')

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_test_pred_lr)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# ROC Curve
y_test_probs_lr = best_lr_model.predict_proba(x_test)[:, 1]
fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_test, y_test_probs_lr)
plt.plot(fpr_lr, tpr_lr, label='Logistic Regression (AUC = {:.2f})'.format(auc(fpr_lr, tpr_lr)))
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

# Precision-Recall Curve
precision_lr, recall_lr, thresholds_lr = precision_recall_curve(y_test, y_test_probs_lr)
plt.plot(recall_lr, precision_lr, label='Logistic Regression')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()


train_sizes, train_scores, test_scores = learning_curve(best_lr_model, x_train, y_train, cv=5, train_sizes=np.linspace(0.1, 1.0, 10), scoring='accuracy')

test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
train_sizes = np.linspace(0.1, 1.0, 10)

plt.figure(figsize=(8, 6))

# Plotting Testing Accuracy
plt.plot(train_sizes, test_mean, label='Testing Accuracy', color='red')
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color='red', alpha=0.1)
plt.xlabel('Number of Training Examples')
plt.ylabel('Accuracy')
plt.title('Testing Accuracy')
plt.legend()

plt.tight_layout()
plt.show()
Training Accuracy of Logistic Regression Model: 0.999375
Test Accuracy of Logistic Regression Model: 0.8675
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       204
           1       0.86      0.88      0.87       196

    accuracy                           0.87       400
   macro avg       0.87      0.87      0.87       400
weighted avg       0.87      0.87      0.87       400

SVM Model¶

In [28]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import joblib
from sklearn.metrics import classification_report

train_accuracy = []
test_accuracy = []

# Define the SVM model
svm_model = SVC(probability=True)

# Define the parameter grid for grid search
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly']}

# Perform grid search
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

# Get the best model from grid search
best_svm_model = grid_search.best_estimator_

# Predictions on training data
y_train_pred_svm = best_svm_model.predict(x_train)
train_accuracy_svm = accuracy_score(y_train, y_train_pred_svm)
print("Training Accuracy of SVM Model:", train_accuracy_svm)
train_accuracy.append(train_accuracy_svm)

# Predictions on testing data
y_test_pred_svm = best_svm_model.predict(x_test)
accuracy_svm = accuracy_score(y_test, y_test_pred_svm)
print("Test Accuracy of SVM Model:", accuracy_svm)
test_accuracy.append(accuracy_svm)

accuracy.append(accuracy_svm)


print("Classification Report:")
print(classification_report(y_test, y_test_pred_svm))


# Save the trained model
joblib.dump(best_svm_model, 'best_svm_model.pkl')


# Confusion Matrix
conf_matrix_svm = confusion_matrix(y_test, y_test_pred_svm)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_svm, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# ROC Curve
y_test_probs_svm = best_svm_model.predict_proba(x_test)[:, 1]
fpr_svm, tpr_svm, thresholds_svm = roc_curve(y_test, y_test_probs_svm)
plt.plot(fpr_svm, tpr_svm, label='SVM (AUC = {:.2f})'.format(auc(fpr_svm, tpr_svm)))
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

# Precision-Recall Curve
precision_svm, recall_svm, thresholds_svm = precision_recall_curve(y_test, y_test_probs_svm)
plt.plot(recall_svm, precision_svm, label='SVM')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()

# Learning Curve
train_sizes, train_scores, test_scores = learning_curve(best_svm_model, x_train, y_train, cv=5, train_sizes=np.linspace(0.1, 1.0, 10), scoring='accuracy')

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plotting Training Accuracy
axes[0].plot(train_sizes, train_mean, label='Training Accuracy', color='blue')
axes[0].fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color='blue', alpha=0.1)
axes[0].set_xlabel('Number of Training Examples')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Training Accuracy')
axes[0].legend()

# Plotting Testing Accuracy
axes[1].plot(train_sizes, test_mean, label='Testing Accuracy', color='red')
axes[1].fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color='red', alpha=0.1)
axes[1].set_xlabel('Number of Training Examples')
axes[1].set_ylabel('Accuracy')
axes[1].set_title('Testing Accuracy')
axes[1].legend()


plt.tight_layout()
plt.show()
Training Accuracy of SVM Model: 0.986875
Test Accuracy of SVM Model: 0.8575
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       204
           1       0.85      0.86      0.85       196

    accuracy                           0.86       400
   macro avg       0.86      0.86      0.86       400
weighted avg       0.86      0.86      0.86       400

XG-Boost Model¶

In [29]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import joblib

train_accuracy = []
test_accuracy = []

# Define the XGBoost model
xgb_model = XGBClassifier()

# Define the parameter grid for grid search
param_grid = {'learning_rate': [0.001, 0.01, 0.1, 1], 'n_estimators': [100, 200, 300], 'max_depth': [3, 5, 7]}

# Perform grid search
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

# Get the best model from grid search
best_xgb_model = grid_search.best_estimator_

# Predictions on training data
y_train_pred_xgb = best_xgb_model.predict(x_train)
train_accuracy_xgb = accuracy_score(y_train, y_train_pred_xgb)
print("Training Accuracy of XGBoost Model:", train_accuracy_xgb)
train_accuracy.append(train_accuracy_xgb)

# Predictions on testing data
y_test_pred_xgb = best_xgb_model.predict(x_test)
accuracy_xgb = accuracy_score(y_test, y_test_pred_xgb)
print("Test Accuracy of XGBoost Model:", accuracy_xgb)
test_accuracy.append(accuracy_xgb)


accuracy.append(accuracy_xgb)


print("Classification Report:")
print(classification_report(y_test, y_test_pred_xgb))


# Save the trained model
joblib.dump(best_xgb_model, 'best_xgb_model.pkl')

# Confusion Matrix
conf_matrix_xgb = confusion_matrix(y_test, y_test_pred_xgb)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_xgb, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# ROC Curve
y_test_probs_xgb = best_xgb_model.predict_proba(x_test)[:, 1]
fpr_xgb, tpr_xgb, thresholds_xgb = roc_curve(y_test, y_test_probs_xgb)
plt.plot(fpr_xgb, tpr_xgb, label='XGBoost (AUC = {:.2f})'.format(auc(fpr_xgb, tpr_xgb)))
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

# Precision-Recall Curve
precision_xgb, recall_xgb, thresholds_xgb = precision_recall_curve(y_test, y_test_probs_xgb)
plt.plot(recall_xgb, precision_xgb, label='XGBoost')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()

# Learning Curve
train_sizes, train_scores, test_scores = learning_curve(best_xgb_model, x_train, y_train, cv=5, train_sizes=np.linspace(0.1, 1.0, 10), scoring='accuracy')

test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
train_sizes = np.linspace(0.1, 1.0, 10)

plt.figure(figsize=(8, 6))

# Plotting Testing Accuracy
plt.plot(train_sizes, test_mean, label='Testing Accuracy', color='red')
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color='red', alpha=0.1)
plt.xlabel('Number of Training Examples')
plt.ylabel('Accuracy')
plt.title('Testing Accuracy')
plt.legend()

plt.tight_layout()
plt.show()
Training Accuracy of XGBoost Model: 1.0
Test Accuracy of XGBoost Model: 0.7925
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80       204
           1       0.79      0.79      0.79       196

    accuracy                           0.79       400
   macro avg       0.79      0.79      0.79       400
weighted avg       0.79      0.79      0.79       400

Random Forest Model¶

In [30]:
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import joblib


ensemble_model = RandomForestClassifier()

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Perform grid search
grid_search = GridSearchCV(ensemble_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

# Get the best model from grid search
best_ensemble_model = grid_search.best_estimator_

# Predictions on training data
y_train_pred_ensemble = best_ensemble_model.predict(x_train)
train_accuracy_ensemble = accuracy_score(y_train, y_train_pred_ensemble)
print("Training Accuracy of Ensemble Model:", train_accuracy_ensemble)

# Predictions on testing data
y_test_pred_ensemble = best_ensemble_model.predict(x_test)
accuracy_ensemble = accuracy_score(y_test, y_test_pred_ensemble)
print("Test Accuracy of Ensemble Model:", accuracy_ensemble)


accuracy.append(accuracy_ensemble)

print("Classification Report:")
print(classification_report(y_test, y_test_pred_ensemble))


# Save the trained model
joblib.dump(best_ensemble_model, 'best_ensemble_model.pkl')

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_test_pred_ensemble)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# ROC Curve
y_test_probs_ensemble = best_ensemble_model.predict_proba(x_test)[:, 1]
fpr_ensemble, tpr_ensemble, thresholds_ensemble = roc_curve(y_test, y_test_probs_ensemble)
plt.plot(fpr_ensemble, tpr_ensemble, label='Ensemble Model (AUC = {:.2f})'.format(auc(fpr_ensemble, tpr_ensemble)))
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

# Precision-Recall Curve
precision_ensemble, recall_ensemble, thresholds_ensemble = precision_recall_curve(y_test, y_test_probs_ensemble)
plt.plot(recall_ensemble, precision_ensemble, label='Ensemble Model')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()

# Learning Curve
train_sizes, train_scores, test_scores = learning_curve(best_ensemble_model, x_train, y_train, cv=5, train_sizes=np.linspace(0.1, 1.0, 10), scoring='accuracy')

test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.figure(figsize=(8, 6))

# Plotting Testing Accuracy
plt.plot(train_sizes, test_mean, label='Testing Accuracy', color='red')
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color='red', alpha=0.1)
plt.xlabel('Number of Training Examples')
plt.ylabel('Accuracy')
plt.title('Testing Accuracy')
plt.legend()

plt.tight_layout()
plt.show()
M:\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py:547: FitFailedWarning:


405 fits failed out of a total of 1215.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
405 fits failed with the following error:
Traceback (most recent call last):
  File "M:\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "M:\Anaconda\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "M:\Anaconda\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "M:\Anaconda\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestClassifier must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.


M:\Anaconda\Lib\site-packages\sklearn\model_selection\_search.py:1051: UserWarning:

One or more of the test scores are non-finite: [     nan      nan      nan      nan      nan      nan      nan      nan
      nan      nan      nan      nan      nan      nan      nan      nan
      nan      nan      nan      nan      nan      nan      nan      nan
      nan      nan      nan 0.75125  0.785625 0.798125 0.75375  0.79
 0.811875 0.76375  0.78875  0.799375 0.756875 0.775625 0.81125  0.77
 0.784375 0.805625 0.771875 0.779375 0.80375  0.766875 0.783125 0.79875
 0.75625  0.7925   0.80125  0.760625 0.7825   0.80625  0.695625 0.720625
 0.76125  0.69875  0.73625  0.768125 0.721875 0.745    0.76625  0.7075
 0.74625  0.780625 0.735625 0.7625   0.77875  0.71125  0.7375   0.7675
 0.681875 0.715625 0.7475   0.7025   0.729375 0.739375 0.68     0.73375
 0.753125      nan      nan      nan      nan      nan      nan      nan
      nan      nan      nan      nan      nan      nan      nan      nan
      nan      nan      nan      nan      nan      nan      nan      nan
      nan      nan      nan      nan 0.741875 0.7925   0.798125 0.73375
 0.77     0.7875   0.739375 0.768125 0.794375 0.740625 0.7725   0.7975
 0.736875 0.76875  0.785625 0.7425   0.783125 0.798125 0.74625  0.766875
 0.785625 0.745    0.7725   0.789375 0.74625  0.779375 0.7925   0.653125
 0.674375 0.719375 0.65     0.66625  0.71     0.64125  0.65375  0.720625
 0.640625 0.6975   0.713125 0.655625 0.681875 0.700625 0.648125 0.69375
 0.71     0.6325   0.675625 0.721875 0.643125 0.6625   0.70125  0.630625
 0.673125 0.716875      nan      nan      nan      nan      nan      nan
      nan      nan      nan      nan      nan      nan      nan      nan
      nan      nan      nan      nan      nan      nan      nan      nan
      nan      nan      nan      nan      nan 0.754375 0.781875 0.79
 0.748125 0.785    0.7875   0.755    0.79375  0.798125 0.74625  0.781875
 0.8      0.766875 0.78     0.79875  0.754375 0.7825   0.79375  0.75375
 0.7825   0.800625 0.770625 0.78     0.799375 0.7625   0.78125  0.800625
 0.681875 0.695    0.72375  0.68375  0.69375  0.745    0.6725   0.704375
 0.735625 0.68625  0.713125 0.74625  0.68125  0.710625 0.74     0.66875
 0.72125  0.736875 0.673125 0.7125   0.74125  0.654375 0.7      0.735
 0.66625  0.713125 0.734375]

Training Accuracy of Ensemble Model: 1.0
Test Accuracy of Ensemble Model: 0.8025
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       204
           1       0.81      0.79      0.80       196

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400

In [31]:
accuracy
Out[31]:
[0.8675, 0.8675, 0.8575, 0.7925, 0.8025]
In [36]:
import matplotlib.pyplot as plt

# List of accuracy scores
accuracy_scores = [0.8575, 0.7925, 0.8675, 0.8025]
models = ['SVM', 'XGBoost', 'logistic regression', 'Random Forest']

plt.figure(figsize=(8, 6))
plt.bar(models, accuracy_scores, color='skyblue')
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.title('Accuracy Scores of Different Models')
plt.ylim(0.7, 0.9)  # Adjust the y-axis limits if needed
plt.show()

Test Script¶

In [33]:
def preprocess_text(text):

    # Tokenization
    tokens = word_tokenize(text)

    # Lowercasing, removing punctuation, and removing stopwords
    stop_words = set(stopwords.words('english'))
    processed_tokens = [token.lower() for token in tokens if token.lower() not in stop_words and token.lower() not in string.punctuation]

    # Stemming
    processed_tokens = [ps.stem(token) for token in processed_tokens]

    # Removing special characters, emojis, and handling contractions
    processed_tokens = [re.sub(r'[^a-zA-Z0-9\s]', '', token) for token in processed_tokens]

    # Handling numerical data
    processed_tokens = ['NUM' if token.isdigit() else token for token in processed_tokens]

    # Joining Tokens
    processed_text = ' '.join(processed_tokens)

    return processed_text


def tokenizer(text):
        return text.split()


def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
In [35]:
test_positive_folder = 'M:/s8/NLP/project/NLP_Sentiment_Analysis/TestScript Dataset/positive'
test_negative_folder = 'M:/s8/NLP/project/NLP_Sentiment_Analysis/TestScript Dataset/negative'


test_texts = []
test_labels = []
test_textsNames = []

for filename in os.listdir(test_positive_folder):
    test_textsNames.append(filename.split('.')[0])
    with open(os.path.join(test_positive_folder, filename), 'r') as file:
        text = file.read()
        processed_text = preprocess_text(text)
        test_texts.append(processed_text)
        test_labels.append('pos')

for filename in os.listdir(test_negative_folder):
    test_textsNames.append(filename.split('.')[0])
    with open(os.path.join(test_negative_folder, filename), 'r') as file:
        text = file.read()
        processed_text = preprocess_text(text)
        test_texts.append(processed_text)
        test_labels.append('neg')


Model = joblib.load('best_lr_model.pkl')

tfidf_test = joblib.load('tfidf.pkl')
scaler_test = joblib.load('scaler.pkl')
label_encoder_test = joblib.load('label_encoder.pkl')

# applying tfidf on x
x_true = tfidf_test.transform(test_texts)

# applying normalization on x using MaxAbsScaler
x_true = scaler_test.transform(x_true)

# applying One-Hot Encoding on True y
y_true = label_encoder_test.transform(test_labels)

y_predicted = []


# Loop through each text in the data
for text_ in range(x_true.shape[0]):

    # Predict
    Prediction = Model.predict(x_true[text_])
    y_predicted.append(Prediction)

    if Prediction == 0:
        print("Text ", test_textsNames[text_], " is Negative")
    elif Prediction == 1:
        print("Text ", test_textsNames[text_], " is Positive")


Accuracy = accuracy_score(y_true, y_predicted)
Precision, Recall, F1Score, _ = precision_recall_fscore_support(y_true, y_predicted, average='weighted')

print("Sentiment Prediction : ")
print("Accuracy : ", Accuracy)
print("Precision : ", Precision)
print("Recall : ", Recall)
print("F1-score  : ", F1Score)
Text  cv010_29198  is Positive
Text  cv015_29439  is Positive
Text  cv028_26964  is Negative
Text  cv030_22893  is Negative
Sentiment Prediction : 
Accuracy :  1.0
Precision :  1.0
Recall :  1.0
F1-score  :  1.0